
#############################
### 00_replace_names.R #####
###########################

library(pacman)
p_load(textreadr, dplyr, stringr, tidyr, haven, tokenizers, here, readxl, writexl, magrittr, udpipe, stringi, officer)
i_am("Code/00_replace_names.R")


# identify all names in transcripts ####
# List the transcript files in "DOCX Documents". `pattern` is a regular
# expression, so it is anchored to keep only names that end in ".docx"
# (a bare "docx" would match that text anywhere in a file name).
docx_files <- list.files(path = here("DOCX Documents"), pattern = "\\.docx$")

# One row per transcript: a running index and the file name in `names`.
# Building the data frame directly also works for an empty folder, where it
# yields a zero-row data frame instead of failing.
transcriptnames <- data.frame(
  index = seq_along(docx_files),
  names = docx_files,
  stringsAsFactors = FALSE
)

# Collect every proper-noun lemma that udpipe tags in the transcripts.
# Each transcript contributes one character vector; the per-file results are
# stored in a preallocated list and combined once after the loop.
n_transcripts <- nrow(transcriptnames)
names_by_file <- vector("list", n_transcripts)

# seq_len() visits every transcript, starting with the first, and skips the
# loop entirely when no transcripts were found.
for (i in seq_len(n_transcripts)) {
  jury <- textreadr::read_docx(here("DOCX Documents", transcriptnames$names[i]))

  # Annotate the text with udpipe's "english" model and keep one row per
  # lemma whose universal part-of-speech tag is PROPN (proper noun).
  tokens <- udpipe::udpipe(jury, "english")
  pns <- tokens %>%
    filter(upos == "PROPN") %>%
    group_by(lemma) %>%
    summarize(n = n())

  names_by_file[[i]] <- pns$lemma
  print(i) # progress indicator
}

# Deduplicate across transcripts. unlist() of an empty list is NULL, so
# as.character() guarantees a (possibly empty) character vector with no
# placeholder NA entry.
all_names <- unique(as.character(unlist(names_by_file)))

# export 1508 unique potential names
# The replacement step later in this script reads here("names_key.csv") and
# uses its `original` and `replacement` columns. write.csv() on a bare vector
# produces neither column, so an existing file is left untouched rather than
# overwritten with the raw candidate list.
names_key_path <- here("names_key.csv")
if (file.exists(names_key_path)) {
  message(
    "names_key.csv already exists; not overwriting it with the candidate list."
  )
} else {
  write.csv(all_names, names_key_path)
}
# manual review identified 513 likely names

p_load(lexicon)

# Draw replacement names, weighting each candidate by its `prop` value.
# sample() needs exactly one weight per candidate, so the weights are subset
# with the same mask that selects the names.
# NOTE(review): no seed is set, so every run draws a different sample.

# get common first names
f <- lexicon::freq_first_names
is_female <- f$sex == "female"
is_male <- f$sex == "male"

# select enough women's names
f_sample <- sample(
  f$Name[is_female], 237,
  replace = FALSE, prob = f$prop[is_female]
)
write.csv(f_sample, here("f_names.csv"))

# select enough men's names
f_sample <- sample(
  f$Name[is_male], 183,
  replace = FALSE, prob = f$prop[is_male]
)
write.csv(f_sample, here("m_names.csv"))

# get common last names (no subsetting, so the weights already line up)
f <- lexicon::freq_last_names
f_sample <- sample(f$Surname, 92, replace = FALSE, prob = f$prop)
write.csv(f_sample, here("l_names.csv"))

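# Manual step (not performed by this script): create the key file
# names_key.csv with an `original` column (the name as it appears in the
# transcripts) and a `replacement` column (the substitute name); the code
# below reads both columns.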

# replace names in transcripts with new ones ####

# read in replacements key file
# stringsAsFactors = FALSE keeps both columns as character vectors on every
# R version; the replacement functions below need character input.
key <- read.csv(here("names_key.csv"), stringsAsFactors = FALSE)

# Fail early when a required column is absent. Otherwise key$original is NULL
# and paste0() below would silently produce the pattern "\\b\\b" for every row.
missing_cols <- setdiff(c("original", "replacement"), names(key))
if (length(missing_cols) > 0) {
  stop(
    "names_key.csv is missing required column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# add word boundaries so a name only matches as a whole word
key$original <- paste0("\\b", key$original, "\\b")

# read in documents, replace names, and save back out
# seq_len() makes both loops no-ops when there are no files or no key rows.
for (i in seq_len(nrow(transcriptnames))) {
  # read in the document (officer's reader: body_replace_all_text() works on
  # the rdocx object it returns)
  docx_path <- here("DOCX Documents", transcriptnames$names[i])
  jury <- officer::read_docx(docx_path)

  # replace names
  # only_at_cursor = FALSE searches the whole document body; officer's default
  # (TRUE) only touches the block at the cursor. warn = FALSE silences the
  # warning officer raises when a name is absent from a transcript.
  for (k in seq_len(nrow(key))) {
    jury <- officer::body_replace_all_text(
      jury,
      old_value = key$original[k],
      new_value = key$replacement[k],
      only_at_cursor = FALSE,
      warn = FALSE
    )
  }

  # save out result, overwriting the file that was read
  print(jury, target = docx_path)

  print(i) # progress indicator
}

# replace names in unidentified transcripts with new ones ####

# identify transcript names:
# `pattern` is a regular expression; escaping the dot and anchoring the end
# keeps only file names that really end in ".docx".
transcriptnames <- list.files(here("Jury transcripts"), pattern = "\\.docx$")

# seq_along() makes the loop a no-op when the folder holds no transcripts.
for (i in seq_along(transcriptnames)) {
  # read in the document (officer's reader: body_replace_all_text() works on
  # the rdocx object it returns)
  docx_path <- here("Jury transcripts", transcriptnames[i])
  jury <- officer::read_docx(docx_path)

  # replace names
  # only_at_cursor = FALSE searches the whole document body; officer's default
  # (TRUE) only touches the block at the cursor. warn = FALSE silences the
  # warning officer raises when a name is absent from a transcript.
  for (k in seq_len(nrow(key))) {
    jury <- officer::body_replace_all_text(
      jury,
      old_value = key$original[k],
      new_value = key$replacement[k],
      only_at_cursor = FALSE,
      warn = FALSE
    )
  }

  # save out result, overwriting the file that was read
  print(jury, target = docx_path)

  print(i) # progress indicator
}


# replace names in intercoder reliability transcripts ####
# identify transcript names:
# `pattern` is a regular expression; escaping the dot and anchoring the end
# keeps only file names that really end in ".docx".
transcriptnames <- list.files(here("ICR Transcripts"), pattern = "\\.docx$")

# seq_along() makes the loop a no-op when the folder holds no transcripts.
for (i in seq_along(transcriptnames)) {
  # read in the document (officer's reader: body_replace_all_text() works on
  # the rdocx object it returns)
  docx_path <- here("ICR Transcripts", transcriptnames[i])
  jury <- officer::read_docx(docx_path)

  # replace names
  # only_at_cursor = FALSE searches the whole document body; officer's default
  # (TRUE) only touches the block at the cursor. warn = FALSE silences the
  # warning officer raises when a name is absent from a transcript.
  for (k in seq_len(nrow(key))) {
    jury <- officer::body_replace_all_text(
      jury,
      old_value = key$original[k],
      new_value = key$replacement[k],
      only_at_cursor = FALSE,
      warn = FALSE
    )
  }

  # save out result, overwriting the file that was read
  print(jury, target = docx_path)

  print(i) # progress indicator
}


# replace names in identification documents with new ones ####

# Apply every key entry to a character vector. With vectorize_all = FALSE,
# stringi applies the patterns one after another to each element of `x`.
replace_names <- function(x, name_key = key) {
  stri_replace_all_regex(
    x, name_key$original, name_key$replacement,
    vectorize_all = FALSE
  )
}

# Juror speech: names can appear in the identifier, the dialogue and the notes.
# row.names = FALSE stops write.csv() from adding an index column, which
# would otherwise accumulate each time the file is read and written back.
dat <- read.csv(here("Data", "jurorspeech.csv"))
dat$Identifier <- replace_names(dat$Identifier)
dat$Attributed_Dialogue <- replace_names(dat$Attributed_Dialogue)
dat$Notes <- replace_names(dat$Notes)

write.csv(dat, here("Data", "jurorspeech.csv"), row.names = FALSE)


# Transcript coding workbook: only the Notes column is rewritten.
dat <- readxl::read_xlsx(here("Data", "transcript_coding.xlsx"))
dat$Notes <- replace_names(dat$Notes)
write_xlsx(dat, here("Data", "transcript_coding.xlsx"))


# Intercoder reliability file: both identifier columns are rewritten.
dat <- read.csv(here("Data", "icr_identified_jurors.csv"))
dat$Identifier.1 <- replace_names(dat$Identifier.1)
dat$Identifier.2 <- replace_names(dat$Identifier.2)

write.csv(dat, here("Data", "icr_identified_jurors.csv"), row.names = FALSE)
